175904 - Jorge III Altamirano Astorga
177508 - Uriel Miranda Miñón
Se cargan los datos originales en formato JSON y se separan en datos de entrenamiento y de validación; además, existen datos de prueba adicionales provistos por la competencia de Kaggle. :-)
# Load the raw recipes and split them 70/30 into training and validation
# sets. The seed is fixed so the partition is reproducible.
set.seed(175904)
train_raw <- fromJSON("data/train.json")
train_raw <- train_raw %>% as.data.frame
# `train` flags the rows drawn for training; it is addressed by name rather
# than by column position so the split does not depend on the column count.
train_raw$train <- FALSE
# floor() makes explicit the truncation sample() applies to a fractional size.
train_raw$train[
  sample(nrow(train_raw), floor(nrow(train_raw) * .7), replace = FALSE)
] <- TRUE
# Keep the first three columns only, which drops the helper flag.
train <- train_raw[train_raw$train, 1:3]
valid <- train_raw[!train_raw$train, 1:3]
# Working copies that receive the cleaned ingredient lists further down.
train_2 <- train
valid_2 <- valid
# Bar chart with the number of rows on each side of the train/validation flag.
ggplot(data = train_raw, mapping = aes(x = train)) +
  geom_histogram(stat = "count")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
Se limpian los datos eliminando las palabras que consideramos que no añaden valor a la predicción, entre las cuales se encuentran:
Asimismo, se condensaron los datos por palabras clave, las cuales consideramos ordinales en importancia:
Ejemplo de condensación de datos:
Todo esto derivado de estudiar el dataset con el fin de reducir el número de variables.
Por último, calculamos las frecuencias y eliminamos los ingredientes con \(n\) ocurrencias o menos; en nuestro caso elegimos \(n\) de modo que quedaran alrededor de 100 variables.
# Regular expression listing the brand names, preparation states, sizes and
# other qualifiers that the cleaning pipelines replace with a single space.
# Shape: one capture group holding the alternatives, followed by `(\\s|$)`,
# so an alternative only matches when it is followed by whitespace or by the
# end of the string.
# NOTE(review): the group contains an empty alternative (`||` between
# `tortilla chips` and `domino`), so it can also match the empty string in
# front of any whitespace or at the end of the string -- confirm this is
# intended before editing the list.
# NOTE(review): the literal spans three source lines, so the pattern holds a
# line break inside `its? not butter` and inside `warm water`; this may be a
# wrapping artifact -- verify against the original .Rmd.
unimportant_words <- "(^a taste of|any|low-fat|low ?salt|all|powder|baby|bertolli|boiled|boiling|bone in|whole|boneless|bottled|sauvignon|california|campbells condensed|canned|chopped|flavored carbonated beverage|cold|condensed|cooked|cooking|cereal|lowfat|frosting|spread|soften|with chives.*|ic peach| of .*|creamed|creamy| mexican.?|crushed|crystal farms|shredded|crystallized|crystal hot|cubed|curly|curlyleaf|jelly|dessert mix|sauce mix|mix|dark|dellallo|deepfried|deep fried|diced|diet|tortilla chips||domino|dried|minced|dry|earth balance|elmlea|^english|evaporated|everglades|extra fine|extra firm|extra large|extra\\s?lean|extra light|extra sharp|extra\\s?virgin|extra wide|^fat|fat\\s?free|fat skimmed|fatfree?|fattrimmed|fine|firm|firmly packed|flat |^flavored|terrine|food|free\\s?range|^french|^fresh| root|^fresno|^fried|^frozen|^fuji|full\\s?fat|gluten\\s?free|s milk |^gold|golden|gourmet|graham|granulated|grassfed|grated|grating|gravenstein|great|greater|style|green|grilled|grill|ground|half|heavy|heinz|hellmanns?|of the woods|herbed |herdez|hidden valley|homemade|^hot |hot smoked|hot spanish|^hungarian|hurst|i cantbelieve? its? 
not butter|imitation|imperial sugar light brown|instant |^irish|^italian|italianstyle|^japanese |jimmy dean|johnsonville|jose cuervo|jumbo|kikkoman|knorr|knudsen|kraft|mexican style|kraft zesty|slim cut|sun dried|shredded|la victoria|land o lakes|^large|^lean|leftover|leg of|zest|less sodium|lesser|leaves|^light|cook and drain|lipton|liquid |^lite|^long |loosely packed fresh|low fat|lowfat|^low sodium|lower sodium|lowfat|baked|\\sdeli|firm silken|styl|lowsodium|and cheese dinner|madagascar bourbon|extract|mccormick|^medium|uncook|uncooked|merguez|^mexican|minced|mini|mini|mixed|mixture|mizkan|^mms|mrs dash|natural|^nido|non dairy|non fat|non stick|nondairy|nonfat|frozen|nonhydrogenated|nosaltadded|old el paso|old|old\\s?fashioned|cooking spray|flavored|^organic|oscar mayer|other|oven\\s?ready|flavor|flavour|paella|reggiano|peeled|^petite|pillsbury|powdered|prepared|preserv|preserved|progresso|\\sdi\\sparma|pt|pte|puff|puffed|pure|quickcooking|quick|cooking|raw|red|reduced\\sfat|reduced\\ssodium|reduced\\ssugar|reducedfat|reducedsodium|reducedsugar|refrigerated|regular|rich|roasted|roast|roasting|robert mondavi|salt free seasoning|salt free chili powder|salt free cajun creole seasoning|salt free southwest chipotle seasoning|salt free herb seasoning|salt free chili powder|salted|saltines?|saltpeper|san marazano|sargento|links|casings|savoy|seafood|seasoned|seasoning|seedless|self ?ra?ising|shredded|single|simple|skinless|sliced|small|smoked|sodium free|sodium reduced|soft|softened|solid|southern comfort.*|southwest|sparkling|spicy|splenda.*|split|spring water|^strip|superfine|sweetened|taco bell.*|into serving pieces|to\\s+lb|toasted|uncle bens|^uncook|^uncooked|unflavou?red|unsweetened|white|wholesome sweeteners|wholemilk|wide|^wild|^winter|wish\\s?bone|yellow|young|zesty|part ?skim|italian|all ?purpose|puree|juice|aged|tuna in water|liqueur|liquor|^asian|and .*|yoplait|greek|fresh|spray|hot water|warm 
water|crumbles|freshly|flakes?|unsalt|unsalted|wedges?|plain)(\\s|$)"
#' Collapse ingredient names onto a core keyword.
#'
#' Any element of `a` that contains one of the keywords below is replaced by
#' that keyword alone (e.g. "black_beans" becomes "beans"). Because the
#' leading `(.*)` is greedy, the right-most keyword in the string wins
#' ("chicken_stock" becomes "stock"). Keywords are matched as plain
#' substrings, so "eggplant" becomes "egg". Elements without any keyword are
#' returned unchanged.
#'
#' @param a Character vector of ingredient names (words joined by "_").
#' @return Character vector of the same length as `a`, returned visibly.
popular_words <- function(a) {
  # Each keyword is listed once; repeating an alternative does not change
  # what the alternation matches.
  keywords <- c(
    "beans?", "lettuce", "olives?", "tabasco", "potato(es)?", "cilantro",
    "wheat", "shiitake", "lemon", "chives?", "tomato(es)?", "cabbage",
    "peanut", "yogh?o?urt", "rice", "onions?", "ginger", "sesame",
    "jalapeno", "stock", "bacon", "monterey_jack", "vinegar", "sausages?",
    "mozz?arell?a", "feta", "ricotta", "dijon", "masala", "eggs?",
    "coconut_milk", "cheddar", "parmesan", "sour_(crema|cream)", "steak",
    "pork", "beef", "chicken", "oyster", "garlic", "salt", "curry"
  )
  # Group 1 is the prefix, group 2 the keyword that is kept.
  pattern <- paste0("(.*)(", paste(keywords, collapse = "|"), ").*")
  gsub(pattern, "\\2", a)
}
# Clean the ingredient names of a single recipe.
#
# `raw_ingredients` is one cell of the ingredients list column; the result is
# a character vector of normalised names. The steps, in order:
#   1. lower-case the names,
#   2. drop parenthesised remarks,
#   3. drop every character that is not a letter or a space,
#   4. collapse leading whitespace to one space,
#   5. blank out the qualifiers listed in `unimportant_words`,
#   6. squeeze runs of whitespace,
#   7. de-duplicate, trim, and join the remaining words with "_",
#   8. condense onto core keywords with `popular_words()`.
# De-duplication happens before trimming and condensing, so the result can
# still contain repeated names.
clean_ingredients <- function(raw_ingredients) {
  raw_ingredients %>%
    unlist %>%
    tolower %>%
    gsub("\\([^)]*\\)", "", ., perl = TRUE, ignore.case = TRUE) %>%
    gsub("[^ a-z]", "", ., perl = TRUE, ignore.case = TRUE) %>%
    gsub("^\\s+", " ", ., perl = TRUE, ignore.case = TRUE) %>%
    gsub(unimportant_words, " ", ., perl = TRUE, ignore.case = TRUE) %>%
    gsub("\\s+", " ", ., perl = TRUE, ignore.case = TRUE) %>%
    unique %>%
    trimws %>%
    gsub("\\s", "_", ., perl = TRUE, ignore.case = TRUE) %>%
    popular_words
}

# Clean the training ingredients. lapply() always yields a list with one
# element per recipe, whereas sapply() would silently return a matrix if all
# recipes happened to have the same number of ingredients.
train_2$ingredients <- lapply(seq_len(nrow(train)), function(x) {
  clean_ingredients(train[x, 3])
})
# Clean the validation ingredients with the very same pipeline.
valid_2$ingredients <- lapply(seq_len(nrow(valid)), function(x) {
  clean_ingredients(valid[x, 3])
})
# The response must be a factor for the classifiers fitted on train_3.
train_2$cuisine <- train_2$cuisine %>% as.factor
# Kept for backward compatibility with any code that still reads `valid`.
valid$cuisine <- valid$cuisine %>% as.factor
# valid_3 is derived from valid_2, not from `valid`, so this is the frame
# that has to be converted. The training levels are reused so both sets share
# one level set; a cuisine absent from the training data would become NA.
valid_2$cuisine <- factor(valid_2$cuisine, levels = levels(train_2$cuisine))
# Flatten the cleaned per-recipe lists into one long character vector.
ingredients <- unlist(train_2$ingredients)
# Drop the empty strings left behind by the cleaning step. A `unique` step
# was left commented out at this point, so repeated names are kept.
ingredients <- ingredients[nzchar(ingredients)]
head(ingredients, n = 10)
## [1] "lettuce" "olives" "tomatoes" "garlic" "pepper" "onion"
## [7] "beans" "feta" "flour" "pepper"
# One-column data frame of training ingredient names. The column is called
# `ingredients` because as.data.frame() names it after the variable passed in.
ingredients_df <- as.data.frame(ingredients, stringsAsFactors = F) %>%
group_by(ingredients)
# Tabulate how often each name occurs (plyr::count() adds a `freq` column)
# and sort from rarest to most frequent.
ingredients_count <- ingredients_df %>%
plyr::count(.) %>%
arrange(freq)
# `id` is the position in that ascending-frequency order; the frequency plot
# uses it as its x axis.
ingredients_count$id <- 1:nrow(ingredients_count)
head(ingredients_count)
tail(ingredients_count)
# Names that occur more than 150 times, most frequent first. These become
# the indicator columns of the model matrices.
ingredients_top <- ingredients_count %>%
filter(freq > 150) %>%
arrange(desc(freq)) %>%
select(ingredients)
# Append one 0/1 indicator column per vocabulary entry to `base`.
#
# base             data frame with one row per recipe.
# ingredient_lists list with one character vector of ingredients per recipe,
#                  in the same row order as `base`.
# vocab            character vector of ingredient names to encode.
#
# Returns `base` with one numeric column per element of `vocab`, holding 1
# where the recipe contains that ingredient and 0 otherwise. Only the
# vocabulary columns are written, so an ingredient that happens to be called
# like an existing column of `base` cannot overwrite it.
add_ingredient_flags <- function(base, ingredient_lists, vocab) {
  if (length(vocab) == 0L) {
    return(base)
  }
  flags <- vapply(
    ingredient_lists,
    function(found) as.numeric(vocab %in% unlist(found)),
    numeric(length(vocab)),
    USE.NAMES = FALSE
  )
  # vapply() returns the flags recipe after recipe (vocab x recipes, or a
  # plain vector for a single-entry vocab); reshape to recipes x vocab.
  flags <- matrix(
    flags,
    nrow = length(ingredient_lists),
    ncol = length(vocab),
    byrow = TRUE
  )
  colnames(flags) <- vocab
  base[vocab] <- as.data.frame(flags, stringsAsFactors = FALSE)
  base
}

# Names of the retained ingredients, used as column names.
top_ingredient_names <- as.character(unlist(ingredients_top, use.names = FALSE))

# Model matrices: id, cuisine, then one indicator column per ingredient.
# The whole matrix is built at once instead of assigning row by row, which
# copies the data frame on every iteration.
train_3 <- add_ingredient_flags(
  train_2[, 1:2], train_2$ingredients, top_ingredient_names
)
valid_3 <- add_ingredient_flags(
  valid_2[, 1:2], valid_2$ingredients, top_ingredient_names
)
# Persist the cleaned data (train_2/valid_2) and the model matrices
# (train_3/valid_3), in that order, so later sections can reload them.
invisible(Map(
  saveRDS,
  list(train_2, train_3, valid_2, valid_3),
  c(
    "data/train_2.rds",
    "data/train_3.rds",
    "data/valid_2.rds",
    "data/valid_3.rds"
  )
))
# To resume from the saved files instead of recomputing:
# train_2 <- readRDS("data/train_2.rds")
# train_3 <- readRDS("data/train_3.rds")
# valid_2 <- readRDS("data/valid_2.rds")
# valid_3 <- readRDS("data/valid_3.rds")
# Peek at the first rows of both model matrices for a few common columns.
head(select(train_3, id, cuisine, garlic, salt, pepper), n = 5)
head(select(valid_3, id, cuisine, garlic, salt, pepper), n = 5)
# Log-frequency curve of the ingredients that occur more than 100 times,
# ordered by their frequency rank (`id`).
ggplot(
  filter(ingredients_count, freq > 100),
  aes(x = id, y = log(freq))
) +
  geom_line() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Number of cleaned ingredients in each training recipe.
train_2$n_ing <- lengths(train_2$ingredients, use.names = FALSE)
# Distribution of the ingredient count per cuisine.
ggplot(train_2, aes(x = cuisine, y = n_ing)) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Summary statistics of the ingredient count per cuisine. The label column
# is taken from aggregate() itself: its rows follow the group order, whereas
# unique(train_2$cuisine) follows the order of first appearance and would
# attach the statistics to the wrong cuisines.
data.frame(
  Cuisine = aggregate(train_2$n_ing, list(train_2$cuisine), mean)[, 1],
  Mean = aggregate(train_2$n_ing, list(train_2$cuisine), mean)[, 2],
  SD = aggregate(train_2$n_ing, list(train_2$cuisine), sd)[, 2],
  Min = aggregate(train_2$n_ing, list(train_2$cuisine), min)[, 2],
  Max = aggregate(train_2$n_ing, list(train_2$cuisine), max)[, 2]
)
# Relative frequency of each ingredient count, one panel per cuisine.
ggplot(train_2, aes(n_ing, group = cuisine)) +
  geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat = "count") +
  scale_y_continuous(labels = scales::percent) +
  ylab("relative frequencies") +
  facet_wrap(~cuisine)
Vemos que la cocina mexicana es una cocina muy popular. :-)
# Number of training recipes per cuisine, most frequent cuisine first.
ggplot(
  plyr::count(train_2$cuisine),
  aes(x = reorder(x, -freq), y = freq)
) +
  geom_bar(stat = "identity") +
  xlab("Cuisine") +
  ylab("Frecuencias") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Reload the cleaned training data and the saved frequency table.
train_2 <- readRDS("data/train_2.rds")
ingredients_count <- readRDS("data/ingredients_count.Rdata")
# One row per (recipe, ingredient) pair.
ingredients_graph <- unnest(train_2)
# Last 101 rows of the frequency table.
# NOTE(review): presumably the most frequent ingredients, assuming the saved
# table is sorted by ascending frequency -- confirm.
top_ing <- ingredients_count[
  (nrow(ingredients_count) - 100):nrow(ingredients_count),
]
# Occurrences of those ingredients, coloured by cuisine.
ggplot(
  ingredients_graph[ingredients_graph$ingredients %in% top_ing$ingredients, ],
  aes(x = ingredients, fill = cuisine)
) +
  geom_histogram(stat = "count") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning: Ignoring unknown parameters: binwidth, bins, pad
Predice sumamente mal, incluso con el conjunto de entrenamiento. Debido a esto, exploramos otros algoritmos.
# The tree was fitted once with the two commented lines and is loaded from
# disk on later runs:
# arbol_grande <- rpart(cuisine ~ ., data= train_3[,-1], cp=0)
# save(arbol_grande, file = "data/arbol.Rdata")
load(file = "data/arbol.Rdata")
# Draw a pruned version of the tree.
prp(prune(arbol_grande, cp = 0.03), type = 4, extra = 1, digits = 3)
# Predict on the training set; columns 3:210 are the ingredient indicators.
train_3$arbol <- predict(
  arbol_grande,
  newdata = train_3[, 3:210],
  type = "class"
)
# Count of correct (TRUE) and wrong (FALSE) training predictions.
summary(select(mutate(train_3, arbol_pred = (cuisine == arbol)), arbol_pred))
## arbol_pred
## Mode :logical
## FALSE:10294
## TRUE :17547
# Actual cuisine next to the tree prediction.
select(train_3, cuisine, arbol)
# The forest was fitted once with the commented block and is loaded from
# disk on later runs:
# bosque <- foreach(ntree=rep(150, 3), .combine=combine, .multicombine=TRUE,
# .packages='randomForest') %dopar% {
# randomForest(cuisine ~ . , data = train_3[,2:210], ntree=ntree)
# }
# save(bosque, file="data/bosque.Rdata")
load("data/bosque.Rdata")
# Predict on the training set; columns 3:210 are the ingredient indicators.
train_3$bosque <- predict(
  bosque,
  newdata = train_3[, 3:210],
  type = "class"
)
# Count of correct (TRUE) and wrong (FALSE) training predictions.
summary(select(mutate(train_3, bosque_pred = (cuisine == bosque)), bosque_pred))
## bosque_pred
## Mode :logical
## FALSE:2681
## TRUE :25160
# Actual cuisine next to the forest prediction (training set).
select(train_3, cuisine, bosque)
# Reload the saved forest and score the validation set with it.
load("data/bosque.Rdata")
valid_3$bosque <- predict(
  bosque,
  newdata = valid_3[, 3:210],
  type = "class"
)
# Count of correct (TRUE) and wrong (FALSE) validation predictions.
summary(select(mutate(valid_3, bosque_pred = (cuisine == bosque)), bosque_pred))
## bosque_pred
## Mode :logical
## FALSE:3621
## TRUE :8312
# Actual cuisine next to the forest prediction (validation set).
select(valid_3, cuisine, bosque)
# The SVM was fitted once with the commented block and is loaded from disk
# on later runs:
# set.seed(175904)
# svm <- parallelSVM(cuisine ~ . , data = train_3[,2:210],
# numberCores = detectCores()-1,
# samplingSize = 0.2,
# na.action = na.omit,
# scale = TRUE)
# save(svm, file = "data/svm.Rdata")
load(file = "data/svm.Rdata")
# Predict on the training set; columns 3:210 are the ingredient indicators.
train_3$svm <- predict(
  svm,
  newdata = train_3[, 3:210],
  type = "class"
)
# Count of correct (TRUE) and wrong (FALSE) training predictions.
summary(select(mutate(train_3, svm_pred = (cuisine == svm)), svm_pred))
## svm_pred
## Mode :logical
## FALSE:8666
## TRUE :19175
# Actual cuisine next to the SVM prediction (training set).
select(train_3, cuisine, svm)
# To resume from the saved files instead of the objects in memory:
# valid_3 <- readRDS("data/valid_3.rds")
# load(file = "data/svm.Rdata")
# Score the validation set with both models; columns 3:210 are the
# ingredient indicators.
valid_3$svm <- predict(
  svm,
  newdata = valid_3[, 3:210],
  type = "class"
)
valid_3$bosque <- predict(
  bosque,
  newdata = valid_3[, 3:210],
  type = "class"
)
# Count of correct (TRUE) and wrong (FALSE) SVM predictions on validation.
summary(select(mutate(valid_3, svm_pred = (cuisine == svm)), svm_pred))
## svm_pred
## Mode :logical
## FALSE:4050
## TRUE :7883
# Actual cuisine next to the SVM prediction (validation set).
select(valid_3, cuisine, svm)
# Per-cuisine validation accuracy of both models: each prediction column is
# turned into a hit/miss flag and then averaged within the cuisine.
valid_3a <- valid_3 %>%
  select(cuisine, svm, bosque) %>%
  group_by(cuisine) %>%
  mutate(
    bosque = (cuisine == bosque),
    svm = (cuisine == svm)
  )
data_valid <- summarise(valid_3a, bosque = mean(bosque), svm = mean(svm))
# Shared x-axis styling for both bar charts.
rotated_labels <- theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Forest accuracy, best cuisine first.
g1 <- ggplot(data_valid, aes(x = reorder(cuisine, -bosque), y = bosque)) +
  geom_bar(stat = "identity") +
  xlab("cuisine") +
  rotated_labels
# SVM accuracy, best cuisine first.
g2 <- ggplot(data_valid, aes(x = reorder(cuisine, -svm), y = svm)) +
  geom_bar(stat = "identity") +
  xlab("cuisine") +
  rotated_labels
rm(rotated_labels)
grid.arrange(g1, g2)
glimpse(valid_3)
## Observations: 11,933
## Variables: 212
## $ id <int> 20130, 22213, 42779, 3735, 45887, 2698, 1...
## $ cuisine <chr> "filipino", "indian", "spanish", "italian...
## $ salt <dbl> 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0,...
## $ garlic <dbl> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0,...
## $ onions <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,...
## $ olive <dbl> 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,...
## $ chicken <dbl> 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0,...
## $ pepper <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0,...
## $ sugar <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,...
## $ tomatoes <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,...
## $ water <dbl> 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ butter <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,...
## $ black_pepper <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ onion <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ eggs <dbl> 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,...
## $ flour <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ lemon <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cilantro <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ginger <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ vinegar <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ rice <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,...
## $ vegetable_oil <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ soy_sauce <dbl> 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,...
## $ lime <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,...
## $ milk <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ beans <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,...
## $ cumin <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ sesame <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,...
## $ bell_pepper <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ egg <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ carrots <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ parmesan <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ beef <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ parsley <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,...
## $ oregano <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ basil <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ chili <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ tomato <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ potatoes <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,...
## $ brown_sugar <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,...
## $ oil <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ pork <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ wine <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ shrimp <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,...
## $ thyme <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ stock <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,...
## $ sour_cream <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cream <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cinnamon <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cheddar <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ scallions <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ baking <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ vanilla <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ jalapeno <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ corn_starch <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ peanut <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...
## $ coriander <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cayenne_pepper <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,...
## $ paprika <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ curry <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ shallots <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ fish_sauce <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ mozzarella <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,...
## $ olives <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ celery <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ honey <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ spinach <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ avocado <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cabbage <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...
## $ coconut_milk <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ canola_oil <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ bacon <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ lettuce <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ sausage <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,...
## $ mushrooms <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ bay <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,...
## $ mint <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ orange <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ yogurt <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ nutmeg <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ flat_leaf_parsley <dbl> 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ masala <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ corn_tortillas <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ buttermilk <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cucumber <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ salsa <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ baking_soda <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ steak <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...
## $ flour_tortillas <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cumin_seed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ turmeric <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ peas <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ chilies <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ whipping_cream <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ zucchini <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ bay_leaf <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cheese <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ mayonaise <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ cream_cheese <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ tumeric <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ rosemary <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ bread_crumbs <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ corn <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ raisins <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ricotta <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ wheat <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ worcestershire_sauce <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,...
## $ hot_sauce <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ chili_peppers <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cinnamon_sticks <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ feta <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ chives <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...
## $ bread <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ monterey_jack <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ clove <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ shiitake <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ celery_ribs <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ oyster <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...
## $ dijon <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ tofu <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ capers <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ bean <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cornmeal <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ allspice <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ almonds <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ mirin <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ taco <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ chile <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,...
## $ cayenne <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ sauce <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ vegetable_broth <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ black_peppercorns <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ coconut <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ pecans <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ mustard <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cloves <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ketchup <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ hoisin_sauce <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cracked_black_pepper <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cardamom <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ corn_kernels <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ confectioners_sugar <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ potato <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ sage <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ leeks <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ham <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ active_yeast <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ chickpeas <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ sausages <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cajun <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ parmigiano_cheese <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ enchilada_sauce <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ sherry <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ pasta <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ coriander_seeds <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ dill <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cocoa <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ sake <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ pinenuts <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ corn_meal <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ spaghetti <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ chiles <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ chile_pepper <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ fennel_seeds <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ tomatillos <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ turkey <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ baguette <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ hot_pepper_sauce <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,...
## $ mustard_seeds <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ lamb <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ okra <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ pineapple <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ prosciutto <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ sriracha <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ melted_butter <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ serrano_chile <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ peppers <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ creole <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,...
## $ yoghurt <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ghee <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ walnuts <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ peaches <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ radishes <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ beer <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ lasagna_noodles <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ pasta_sauce <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ grits <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ asparagus <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ corn_syrup <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ mango <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ shaoxing_wine <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ saffron_threads <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ vegetables <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ star_anise <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ lentils <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ yeast <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ broccoli <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ chinese_fivespice <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ marinara_sauce <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ linguine <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ shortening <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ noodles <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ hot_pepper <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ margarine <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cardamom_pods <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ tarragon <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ strawberries <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ coconut_oil <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ bananas <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ chipotle_chile <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ bosque <fctr> chinese, indian, mexican, italian, chine...
## $ svm <fctr> chinese, indian, italian, italian, chine...
# Load the Kaggle test recipes (no cuisine label) and apply the same cleaning
# and encoding that was used for the training data.
test_raw <- fromJSON("data/test.json")
test_raw <- test_raw %>% as.data.frame
test <- test_raw
# test_raw$ingredients %>% unlist
# Column 2 holds the raw ingredients. lapply() always returns a list with one
# element per recipe, whereas sapply() would return a matrix if all recipes
# had the same number of ingredients.
test$ingredients <- lapply(seq_len(nrow(test_raw)), function(x) {
  test[x, 2] %>%
    unlist %>%
    tolower %>%
    gsub("\\([^)]*\\)", "", ., perl = TRUE, ignore.case = TRUE) %>%
    gsub("[^ a-z]", "", ., perl = TRUE, ignore.case = TRUE) %>%
    gsub("^\\s+", " ", ., perl = TRUE, ignore.case = TRUE) %>%
    gsub(unimportant_words, " ", ., perl = TRUE, ignore.case = TRUE) %>%
    gsub("\\s+", " ", ., perl = TRUE, ignore.case = TRUE) %>%
    unique %>%
    trimws %>%
    gsub("\\s", "_", ., perl = TRUE, ignore.case = TRUE) %>%
    popular_words
})
test
test_2 <- test[, 1] %>% data_frame(id = .)
# Create one indicator column per retained ingredient, initialised to 0.
test_2[, ingredients_top$ingredients] <- 0
# Set to 1 the columns whose name appears among the recipe's ingredients.
# seq_len() keeps the loop from running when the test set is empty.
for (i in seq_len(nrow(test_2))) {
  test_2[i, which(names(test_2) %in% (test[i, ]$ingredients %>% unlist))] <- 1
}
test_2